### data prep for all three surveys 


#################################################################################
#### begin basic data processing, i.e. pulling data out of the raw BES files #### 
#################################################################################


##############
#### 2005 ####
##############

library(foreign)

# Read the raw 2005 BES internet panel file (N = 7793 before filtering) and
# keep only the rows whose post_w8 value is non-missing.
d <- read.dta("2005BESinternet.dta")
d <- d[which(!is.na(d$post_w8)), ]

##### Get N of post-election wave ####

# Count of non-missing post_q19 answers, followed by their distribution.
sum(!is.na(d$post_q19))
table(d$post_q19)


##### Party feeling scores ("Extra feeling scores" -- not sure what Extra means) #####

## Pre-wave party feeling scores, taken from pre_q84 to pre_q90 (the list
## includes grn and ukip). BNP was not included in the 2005 questionnaire.
vars <- paste0("pre_q", 84:90)
parties <- c("lab", "con", "ld", "grn", "ukip", "snp", "pc")

for (k in seq_along(vars)) {
  # Strip everything except digits, convert to numeric, and blank out any
  # value above 10.
  score <- as.numeric(gsub("[^\\d]", "", d[[vars[k]]], perl = TRUE))
  score[score > 10] <- NA
  d[[paste0(parties[k], "feel")]] <- score
}


## Post-wave party feeling scores. There is no post-wave item for grn and
## ukip, so their entries are NA and the resulting columns are all-NA.
varspost <- c(paste0("post_q", 35:37), NA, NA, paste0("post_q", 38:39))

for (k in seq_along(varspost)) {
  target <- paste0(parties[k], "feelpost")
  if (!is.na(varspost[k])) {
    score <- as.numeric(gsub("[^\\d]", "", d[[varspost[k]]], perl = TRUE))
    score[score > 10] <- NA
    d[[target]] <- score
  } else {
    d[[target]] <- NA
  }
}

# Party leader feeling scores. Source questions (pre wave, post wave):
#   pre_q68, post_q27  Tony Blair      (lab)
#   pre_q69, post_q28  Michael Howard  (con)
#   pre_q70, post_q29  Charles Kennedy (ld)
#   pre_q71, post_q30  Alex Salmond    (snp)
#   pre_q72, post_q31  Elfyn Llwyd     (pc)

leader.parties <- c("lab", "con", "ld", "snp", "pc")
# Column 1 holds the pre-wave question number, column 2 the post-wave one.
leader.nums <- cbind(68:72, 27:31)

for (p in seq_along(leader.parties)) {
  for (w in 1:2) {
    source_var <- paste0(c("pre_q", "post_q")[w], leader.nums[p, w])
    target_var <- paste0(leader.parties[p], "leaderfeel", c("", "post")[w])
    # Digits only -> numeric; anything above 10 becomes missing.
    score <- as.numeric(gsub("[^\\d]", "", d[[source_var]], perl = TRUE))
    score[score > 10] <- NA
    d[[target_var]] <- score
  }
}



# Party identification, including the "squeeze" follow-up, for both waves.

# Switch the whole session to the C locale before the string handling below.
Sys.setlocale("LC_ALL", "C")

d$partyID <- d$pre_q29
d$partyIDpost <- d$post_q13
d$partyIDsqueeze <- d$pre_q31
d$partyIDsqueezepost <- d$post_q15

# Convert each party-ID column to character and collapse every label that
# contains "none" or "know" to a single short label.
for (pid_var in c("partyID", "partyIDpost", "partyIDsqueeze", "partyIDsqueezepost")) {
  answer <- as.character(d[[pid_var]])
  answer[grepl("none", answer)] <- "none"
  answer[grepl("know", answer)] <- "don't know"
  d[[pid_var]] <- answer
}

# Strength of identification, kept as the original labels (as character).
d$partyIDstrength <- as.character(d$pre_q33)
d$partyIDstrengthpost <- as.character(d$post_q17)


# Perceived likelihood of each party winning, from the campaign-wave items
# cam_q33 to cam_q37: keep the digits of each answer and store as numeric.
vars <- paste0("cam_q", 33:37)
parties <- c("lab", "con", "ld", "snp", "pc")

for (k in seq_along(vars)) {
  digits_only <- gsub("[^\\d]", "", d[[vars[k]]], perl = TRUE)
  d[[paste0(parties[k], "_win_campaign")]] <- as.numeric(digits_only)
}


# Election results in a common format across surveys: <party>.vs.prev is
# copied from the <party>01 column and <party>.vs from the <party>05 column.
for (party in parties) {
  previous_col <- paste0(party, "01")
  current_col <- paste0(party, "05")
  d[[paste0(party, ".vs.prev")]] <- d[[previous_col]]
  d[[paste0(party, ".vs")]] <- d[[current_col]]
}



###### Coding the voting outcome #####

# Vote choice as reported in the pre wave and in the post wave.
d$vote.pre <- d$pre_q36
d$vote.post <- d$post_q20

# Non-voter flag from post_q19: TRUE when the answer is "no", NA when it is
# "refuse" (or missing), FALSE otherwise.
d$did.not.vote.post <- ifelse(d$post_q19 == "refuse", NA, d$post_q19 == "no")

# Reason for vote: replace codes 1-5 of post_q23 with their text labels, one
# code at a time.
d$reasonForVote <- d$post_q23
reason_labels <- c(
  "The party had the best policies",
  "The party had the best leader",
  "I really preferred another party but it stood no chance of w",
  "I voted tactically",
  "Other reasons"
)
for (code in seq_along(reason_labels)) {
  d$reasonForVote[d$reasonForVote == code] <- reason_labels[code]
}

# Follow-up items on the reason for the vote and the really-preferred party.
d$reasonForVoteOth <- d$post_q24
d$partyReallyPrefer <- d$post_q25
d$otherPartyReallyPrefer <- d$post_q26


#### Now: extract some predictors 

library(gtools)
library(car)
library(arm)

# The knowledge battery does not seem to appear in the 2005 internet survey,
# so this section is switched off. The code inside is kept for reference and
# only runs if `know` is set to TRUE.
know <- FALSE
if (know) {
  knowvars <- paste0("bq79_", 1:8)
  correctanswers <- c(TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE)

  # One 0/1 column per item: 1 when the answer equals the correct one.
  knowdf <- d[, knowvars]
  for (item in seq_along(knowvars)) {
    knowdf[, item] <- as.numeric(knowdf[, item] == correctanswers[item])
  }
  names(knowdf) <- paste0(names(knowdf), "Dummy")

  # Row-wise count and share of correct answers, ignoring missing items.
  d$ukpolsKnowSum <- rowSums(knowdf, na.rm = TRUE)
  d$ukpolsKnowMean <- rowMeans(knowdf, na.rm = TRUE)

  d$ukpolknow <- d$ukpolsKnowMean
}


##### BES knowledge battery: international politicians
# not in 2005 BES apparently 



#### Knows MP or not
# not in 2005 BES apparently 



#### Self-reported consumption of political news in media
# there is some stuff but doesn't match up exactly to 2015 questions 

# Only election interest and attention to politics are available in 2005.

# Interest: 5 minus the numeric code of post_q18; results below 1 are set to
# missing. The table prints a cross-check of the recode.
d$interest <- 5 - as.numeric(d$post_q18)
d$interest[d$interest < 1] <- NA
table(d$post_q18, d$interest, useNA = "ifany")

# Two-way quantile split of interest, plus one 0/1 dummy per category.
d$polintcat <- quantcut(
  d$interest,
  q = seq(0, 1, length.out = 3),
  na.rm = TRUE,
  labels = c("low", "high")
)
d$polintlow <- as.numeric(d$polintcat == "low")
d$polinthigh <- as.numeric(d$polintcat == "high")



# Attention: post_q49 as-is, split into three quantile groups with dummies,
# and a separate above-median indicator from a two-way split.
d$attention <- d$post_q49
d$polattncat <- quantcut(
  d$attention,
  q = seq(0, 1, length.out = 4),
  na.rm = TRUE,
  labels = c("low", "med", "high")
)
d$polattnlow <- as.numeric(d$polattncat == "low")
d$polattnmed <- as.numeric(d$polattncat == "med")
d$polattnhigh <- as.numeric(d$polattncat == "high")
d$polattn.above.median <- as.numeric(
  quantcut(
    d$attention,
    q = seq(0, 1, length.out = 3),
    na.rm = TRUE,
    labels = c("no", "yes")
  ) == "yes"
)


#### Has the respondent been contacted by a party?

# In the BES 2005 post-election wave, contact is measured by two items:
# whether a party canvassed the respondent and whether a party telephoned.
d$partycanvass <- as.numeric(d$post_q52 == "yes")
d$partyphone <- as.numeric(d$post_q54 == "yes")

# Summary measure: 1 if canvassed or telephoned. The result is NA whenever
# either of the two indicators is NA.
d$partycontact <- as.numeric((d$partycanvass + d$partyphone) > 0)


#### Personality traits (Big 5)
# not in BES 2005




#### Gender
d$female <- as.numeric(d$pre_q180 == "female")


#### Agegroup
# Year of birth arrives as labels: map "1900 or earlier" to "1900", treat
# "abstain" as missing, then convert to numeric.
d$yobch <- as.character(d$pre_q148)
d$yobch[d$yobch %in% "1900 or earlier"] <- "1900"
d$yobch[d$yobch %in% "abstain"] <- NA
d$yob <- as.numeric(d$yobch)

d$age <- 2005 - d$yob

# Three age groups, coded 1/2/3 by counting how many cut-offs the age exceeds.
d$agegrp <- factor(
  1 + (d$age > 29) + (d$age > 59),
  labels = c("below30", "30to59", "60plus")
)
d$agegrp.alt <- factor(
  1 + (d$age > 29) + (d$age > 49),
  labels = c("below30", "30to49", "50plus")
)
# Dummy for age 50 and above.
d$agefiftyplus <- as.numeric(d$age >= 50)


#### Income group

# Note: there are several "don't know" and "prefer not to answer" responses.

### income
# Reduce each income label to its digits and hyphens only (after dropping
# non-ASCII characters), e.g. a band becomes "5001-10000".
income_ascii <- iconv(d$pre_q163, "UTF8", "ASCII", sub = "")
d$income.processed <- gsub("[^-0-9]", "", income_ascii, perl = TRUE)

# Three fixed income groups defined directly on the band labels.
d$incomealt <- car::recode(d$income.processed,
                            'c("5000", "5001-10000", "5000-10000", "10001-15000", "15001-20000") = "low";
                            c("20001-25000", "25001-30000", "30001-35000", "35001-40000", "40001-45000", "45001-50000") = "med"; c("50001-60000", "60001-70000", "70001-80000", "80001-90000", "90001-100000", "100000") = "high"; c("", "dk", "na") = NA')

d$incomealtlow <- d$incomealt == "low"
d$incomealtmed <- d$incomealt == "med"
d$incomealthigh <- d$incomealt == "high"

# Numeric band: the part of the label before the first hyphen.
d$incomeband <- as.integer(gsub("\\-.+", "", d$income.processed))
d$income.above.median <- d$incomeband > median(d$incomeband, na.rm = TRUE)
d$income.above.median[is.na(d$incomeband)] <- NA

# Data-driven terciles of the numeric band, with one logical flag per tercile.
d$income <- quantcut(d$incomeband, q = 3, na.rm = TRUE)
income_levels <- levels(d$income)
d$incomelow <- d$income == income_levels[1]
d$incomemed <- d$income == income_levels[2]
d$incomehigh <- d$income == income_levels[3]



#### Class
# skipping in BES2010 as well 

# Almost half of respondents have missing scores on ns_sec_analytic;
# need to look into this measure more to find out why.




#### Education level

### Education

# Build the education label: take pre_q156 as character and replace the
# literal "*" with "star", then collapse the two GCSE labels to the short
# forms that the recode spec below matches on.
d$education = gsub("\\*","star", as.character(d$pre_q156))
d$education[grep("gcse astar", d$education)] <- "gcse astar to c etc"
d$education[grep("gcse grades d", d$education)] <- "gcse d to g etc"

# note people who are NA are people who answer no qualifications in pre_q155

# d$education.bak <- besdat$education
# Map each education label to a qualification level ("Level 1" to
# "Level 4/5", or "Other"). Missing education becomes "No qualifications";
# any label not listed in the spec becomes NA.
d$qualifications <- car:::recode(d$education,  # many questions in here 
                                 "NA='No qualifications'; 
                                 'youth training certificate, skill seekers'='Level 2';
                                 'recognised trade apprenticeship completed'='Level 2';
                                 'clerical and commercial qualification (e.g., typing, shortha'='Level 1';
                                 'city & guilds level 1, scotvec national certificate modules,'='Level 1';
                                 'city & guilds level 2 (craft/intermediate/ordinary) or scotv'='Level 2';
                                 'ordinary national certificate (onc) or diploma (ond), city &'='Level 2';
                                 'gcse d to g etc'='Level 1';
                                 'gcse astar to c etc'='Level 2';
                                 'scottish ordinary bands a-c or pass, scottish standard grade'='Level 2';
                                 'gce a level, s level, a2 level, as level, international bacc'='Level 3';
                                 'higher national certificate (hnc) or higher national diploma'='Level 3';
                                 'scottish higher/higher still grades, scottish slc/supe at hi'='Level 3';
                                 'nursing qualification (e.g., sen, src, scm, rgc)'='Level 4/5';
                                 'teaching qualification (not degree)'='Level 4/5';
                                 'university or cnaa diploma'='Level 4/5';
                                 'university or cnaa first degree, e.g, ba, bsc'='Level 4/5';
                                 'postgraduate degree'='Level 4/5';
                                 'other technical, professional or higher qualification'='Other';
                                 else = NA")


# table(besdat$qualifications, useNA="ifany")

# dummy for degree or higher
# (built with `==`, so it is NA wherever qualifications is NA)
d$educuni <- as.numeric(d$qualifications == "Level 4/5")

# dummy for level 3 or higher
# (built with `%in%`, so it is 0 rather than NA wherever qualifications is NA)
d$educl3plus <- as.numeric(d$qualifications %in% c("Level 4/5", "Level 3"))


##### Now code tax-spend attitude

# Question wording:
#   Now, another issue. Using the 0 to 10 scale below, where the end marked 0
#   means that government should cut taxes a lot and spend much less on health
#   and social services, and the end marked 10 means that government should
#   raise taxes a lot and spend much more on health and social services, where
#   would you place yourself on this scale?

d$taxspend <- as.numeric(d$pre_q118)
# The "don't know" filter below is disabled for 2005 (left as in the original):
#d$taxspend[d$taxspend>11] <- NA # treat Don't know (=12) as NA
# Re-centre the numeric code on 6 and flip the sign.
d$taxspend <- -(d$taxspend - 6)

# Group the re-centred score into left / center / right; the result is a
# factor with exactly those three levels.
d$taxspendgroup <- car::recode(d$taxspend,
                                "-5:-2 = 'left';
                                -1:1 = 'center';
                                2:5 = 'right'",
                                as.factor = TRUE,
                                levels = c("left", "center", "right"))

# One logical flag per group.
d$taxspendgroupleft <- d$taxspendgroup == "left"
d$taxspendgroupcenter <- d$taxspendgroup == "center"
d$taxspendgroupright <- d$taxspendgroup == "right"

# Common identifiers shared with the other survey years.
d$refno <- d$pa05
d$id <- d$besid # serialno
d$year <- 2005

#### See Nick memo for some summaries, shortcomings of these.
# Keep the full processed frame, then build the slim version: the first
# column, pa05 and region (where present), and every column from "labfeel"
# (the first derived variable) to the end.
d2005_big <- d
first_derived <- which(names(d) == "labfeel")
to.keep <- c(
  1,
  which(names(d) %in% c("pa05", "region")),
  first_derived:ncol(d)
)
d2005 <- d[, to.keep]



################
##### 2010 #####
################


## 20160807 rewritten to take out the Myatt Fisher stuff -- that's in the archive version.

# Read the 2010 "Campaign internet panel survey" file (very big).
d <- read.dta("cipsdec2311.dta")

# Drop people who were not in the post-election wave, identified here by a
# missing post_w8.
d <- d[which(!is.na(d$post_w8)), ]

##### Get N of post-election wave ####

# Count of non-missing ccq24 answers.
sum(!is.na(d$ccq24))


##### Party feeling scores.
# Pre wave: aaq63-70 (includes bnp, ukip, greens). Campaign wave: bbq48-52
# (five parties only). Post wave: ccq63-65 and ccq69-70 (again five parties).

# pre-election #

vars <- paste0("aaq", 63:70)
parties <- c("lab", "con", "ld", "grn", "ukip", "bnp", "snp", "pc")

for (k in seq_along(vars)) {
  # Keep only the digits of each answer and store as numeric.
  digits_only <- gsub("[^\\d]", "", d[[vars[k]]], perl = TRUE)
  d[[paste0(parties[k], "feel")]] <- as.numeric(digits_only)
}

# party.feels = paste0(parties, "feel")

# post-election
# grn, ukip and bnp have no post-wave item, so their entries are NA and the
# resulting columns are all-NA (pre-election values were previously used here).
vars <- c(paste0("ccq", 63:65), rep(NA, 3), paste0("ccq", 69:70))
parties <- c("lab", "con", "ld", "grn", "ukip", "bnp", "snp", "pc")

for (k in seq_along(vars)) {
  target <- paste0(parties[k], "feelpost")
  if (!is.na(vars[k])) {
    digits_only <- gsub("[^\\d]", "", d[[vars[k]]], perl = TRUE)
    d[[target]] <- as.numeric(digits_only)
  } else {
    d[[target]] <- NA
  }
}

## Party leader feeling scores. Source questions (pre wave, post wave):
##   aaq52, ccq52  Gordon Brown    (lab)
##   aaq53, ccq53  David Cameron   (con)
##   aaq54, ccq54  Nick Clegg      (ld)
##   aaq55, ccq55  Alex Salmond    (snp)
##   aaq56, ccq56  Ieuan Wyn Jones (pc)


# Both waves share the question number; the prefix ("aa" = pre, "cc" = post)
# selects the wave.
vars <- paste0("q", 52:56)
parties <- c("lab", "con", "ld", "snp", "pc")

for (p in seq_along(vars)) {
  for (w in 1:2) {
    source_var <- paste0(c("aa", "cc")[w], vars[p])
    target_var <- paste0(parties[p], "leaderfeel", c("", "post")[w])
    # Keep only the digits of each answer and store as numeric.
    d[[target_var]] <- as.numeric(gsub("[^\\d]", "", d[[source_var]], perl = TRUE))
  }
}


# party.feelspost = paste0(parties, "feelpost") 

# Switch the whole session to the C locale before the string handling below.
Sys.setlocale("LC_ALL", "C")

# Party ID variables for the 2010 BES.
# Pre wave: aaq28 (ID), aaq30 (squeeze), aaq32 (strength).
# Post wave: ccq28 (ID), ccq30 (squeeze), ccq32 (strength).
# NOTE(review): an earlier comment named aaq31 as the pre-wave "closer" item,
# while the code reads aaq30 - confirm against the questionnaire.
d$partyID <- d$aaq28
d$partyIDpost <- d$ccq28
d$partyIDsqueeze <- d$aaq30
d$partyIDsqueezepost <- d$ccq30
d$partyIDstrength <- d$aaq32
d$partyIDstrengthpost <- d$ccq32

# Convert each party-ID column to character, collapse labels containing
# "none" / "know" to one short label, and blank out "skipped" / "asked" labels.
for (pid_var in c("partyID", "partyIDpost", "partyIDsqueeze", "partyIDsqueezepost")) {
  answer <- as.character(d[[pid_var]])
  answer[grepl("none", answer)] <- "none"
  answer[grepl("know", answer)] <- "don't know"
  answer[grepl("skipped", answer)] <- NA
  answer[grepl("asked", answer)] <- NA
  d[[pid_var]] <- answer
}



# Electoral results in a common format across surveys: <party>.vs.prev is
# copied from the <party>05 column and <party>.vs from the <party>10 column.
# Uses whatever `parties` currently holds from the preceding section.
for (k in seq_along(parties)) {
  party <- parties[k]
  d[[paste0(party, ".vs.prev")]] <- d[[paste0(party, "05")]]
  d[[paste0(party, ".vs")]] <- d[[paste0(party, "10")]]
}

# NOTE(review): `$` allows partial name matching on data frames, so
# `greenvt1` may be resolving to a longer column name (e.g. greenvt10) -
# confirm against the raw file.
d$grnvt10 <- d$greenvt1



# Beliefs about outcomes, available for the pre wave (aaq43-48) and the
# campaign wave (bbq33-37).

vars <- c(paste0("aaq", 43:48), paste0("bbq", 33:37))
parties <- c("lab", "con", "ld", "snp", "pc", "other", "lab", "con", "ld", "snp", "pc")

# The wave suffix follows from the question prefix: bbq = campaign wave,
# anything else = pre-campaign.
wave_suffix <- ifelse(grepl("bbq", vars), "_campaign", "_precampaign")

for (k in seq_along(vars)) {
  # Keep only the digits of each answer and store as numeric.
  digits_only <- gsub("[^\\d]", "", d[[vars[k]]], perl = TRUE)
  d[[paste0(parties[k], "_win", wave_suffix[k])]] <- as.numeric(digits_only)
}


# Outcome and reasons.

# Non-voter flag from ccq24: TRUE for "no, did not vote", NA for "don't know"
# (or missing), FALSE otherwise.
d$did.not.vote.post <- ifelse(
  d$ccq24 == "don't know",
  NA,
  d$ccq24 == "no, did not vote"
)
d$vote.post <- d$ccq25

# Reason for vote and the follow-up items on the really-preferred party.
d$reasonForVote <- d$ccq39 # bq12_4
d$reasonForVoteOth <- d$ccq40 # bq12_4
d$partyReallyPrefer <- d$ccq41
d$otherPartyReallyPrefer <- d$ccq42

#### Now: extract some predictors 

# Knowledge battery: switched off for 2010. The code inside is kept for
# reference and only runs if `know` is set to TRUE.
know <- FALSE
if (know) {
  knowvars <- paste0("bq79_", 1:8)
  correctanswers <- c(TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE)

  # One 0/1 column per item: 1 when the answer equals the correct one.
  knowdf <- d[, knowvars]
  for (item in seq_along(knowvars)) {
    knowdf[, item] <- as.numeric(knowdf[, item] == correctanswers[item])
  }
  names(knowdf) <- paste0(names(knowdf), "Dummy")

  # Row-wise count and share of correct answers, ignoring missing items.
  d$ukpolsKnowSum <- rowSums(knowdf, na.rm = TRUE)
  d$ukpolsKnowMean <- rowMeans(knowdf, na.rm = TRUE)

  d$ukpolknow <- d$ukpolsKnowMean
}


##### BES knowledge battery: international politicians
# not in 2010 BES apparently 



#### Knows MP or not
# not in 2010 BES apparently 



#### Self-reported consumption of political news in media
# there is some stuff but doesn't match up exactly to 2015 questions 



#### Self-reported political attention
# Note: the interest-in-election variable is left alone, as it is measured on
# four levels and more than 50% of observations lie at the maximum level.

# Numeric code of ccq131 shifted down by one; "dk" and "not asked" answers
# are set to missing.
d$attention <- as.numeric(d$ccq131) - 1
d$attention[d$ccq131 %in% c("dk", "not asked")] <- NA

# Three quantile groups with one 0/1 dummy each.
d$polattncat <- quantcut(
  d$attention,
  q = seq(0, 1, length.out = 4),
  na.rm = TRUE,
  labels = c("low", "med", "high")
)
d$polattnlow <- as.numeric(d$polattncat == "low")
d$polattnmed <- as.numeric(d$polattncat == "med")
d$polattnhigh <- as.numeric(d$polattncat == "high")

# Above-median indicator from a two-way quantile split.
d$polattn.above.median <- as.numeric(
  quantcut(
    d$attention,
    q = seq(0, 1, length.out = 3),
    na.rm = TRUE,
    labels = c("no", "yes")
  ) == "yes"
)




#### Has the respondent been contacted by a party?
d$partycontact <- as.numeric(d$ccq66 == "yes") # bq86_1 == "yes")


#### Personality traits (Big 5)
# not in BES 2010




#### Gender
d$female <- as.numeric(d$ccq100 == "female")  # bq88


#### Agegroup

# Age is 2010 minus the numeric value of cc_q98.
d$age <- 2010 - as.numeric(as.character(d$cc_q98))

# Three age groups, coded 1/2/3 by counting how many cut-offs the age exceeds.
d$agegrp <- factor(
  1 + (d$age > 29) + (d$age > 59),
  labels = c("below30", "30to59", "60plus")
)
d$agegrp.alt <- factor(
  1 + (d$age > 29) + (d$age > 49),
  labels = c("below30", "30to49", "50plus")
)

# Dummy for age 50 and above.
d$agefiftyplus <- as.numeric(d$age >= 50)


#### Income group

# Note: there are several "don't know" and "prefer not to answer" responses.

### income
# Reduce each income label to its digits and hyphens only.
d$income.raw <- d$aaq166  # bq96
d$income.processed <- gsub("[^\\-\\d]", "", d$income.raw, perl = TRUE)

# Three fixed income groups defined directly on the band labels.
d$incomealt <- car::recode(d$income.processed,
                            'c("5000", "5001-10000", "5000-10000", "10001-15000", "15001-20000") = "low";
                            c("20001-25000", "25001-30000", "30001-35000", "35001-40000", "40001-45000", "45001-50000") = "med"; c("50001-60000", "60001-70000", "70001-80000", "80001-90000", "90001-100000", "100000") = "high"; c("", "dk", "na") = NA')

d$incomealtlow <- d$incomealt == "low"
d$incomealtmed <- d$incomealt == "med"
d$incomealthigh <- d$incomealt == "high"

# Numeric band: the part of the label before the first hyphen.
d$incomeband <- as.integer(gsub("\\-.+", "", d$income.processed))
d$income.above.median <- d$incomeband > median(d$incomeband, na.rm = TRUE)
d$income.above.median[is.na(d$incomeband)] <- NA

# Data-driven terciles of the numeric band, with one logical flag per tercile.
d$income <- quantcut(d$incomeband, q = 3, na.rm = TRUE)
income_levels <- levels(d$income)
d$incomelow <- d$income == income_levels[1]
d$incomemed <- d$income == income_levels[2]
d$incomehigh <- d$income == income_levels[3]



#### Class
# skipping in BES2010 as well 

# Almost half of respondents have missing scores on ns_sec_analytic;
# need to look into this measure more to find out why.




#### Education level

### Education

# Education label, copied as-is from aaq159.
d$education = d$aaq159 #  bq95_3
# d$education.bak <- besdat$education
# Degree-level (or nursing/teaching) qualification flag. `%in%` never
# returns NA, so missing education is FALSE here until reset further down.
d$educuni = d$education %in% c("postgraduate degree", "university or cnaa first degree, e.g, ba, bsc", "university or cnaa diploma", "teaching qualification (not degree)", "nursing qualification (e.g., sen, src, scm, rgc)")
# Level 3 or higher: the degree-level flag plus A-level / HNC / Scottish
# Higher labels.
d$educl3plus = d$educuni | d$education %in% c("gce a level, s level, a2 level, as level, international bacc", "higher national certificate (hnc) or higher national diploma", "scottish higher/higher still grades, scottish slc/supe at hi")
# Qualification level as text, filled in group by group; labels not listed
# below (and missing education) stay NA.
d$qualifications = NA
d$qualifications[d$educuni] = "Level 4/5"
d$qualifications[d$educl3plus & !d$educuni] = "Level 3"
d$qualifications[d$education %in% c(
  "city & guilds level 2 (craft/intermediate/ordinary) or scotv", 
  "ordinary national certificate (onc) or diploma (ond), city &", 
  "gcse a*-c, cse grade 1, gce o level grade a –c, school cer", 
  "scottish ordinary bands a-c or pass, scottish standard grade",
  "youth training certificate, skill seekers", "recognised trade apprenticeship completed")] = "Level 2"
d$qualifications[d$education %in% c(
  "gcse grades d-g, cse grades 2-5, gce o level grade d –e, s", "clerical and commercial qualification (e.g., typing, shortha", "city & guilds level 1, scotvec national certificate modules,")] = "Level 1"
d$qualifications[d$education %in% c(
  "other technical, professional or higher qualification")] = "Other"

# Now that the flags have been used as row selectors above, mark them as
# missing wherever education itself is missing.
d$educuni[is.na(d$education)] = NA
d$educl3plus[is.na(d$education)] = NA



##### Now code tax-spend attitude

# Question wording:
#   Using the 0 to 10 scale below, where the end marked 0 means that
#   government should <i>cut taxes a lot and spend much less on health and
#   social services</i>, and the end marked 10 means that government should
#   <i>raise taxes a lot and spend much more on health and social
#   services</i>, where would you place <i>yourself</i> on this scale?

d$taxspend <- as.numeric(d$aaq104)
# Treat Don't know (=12) as NA.
d$taxspend[d$taxspend > 11] <- NA
# Re-centre the numeric code on 6 and flip the sign.
d$taxspend <- -(d$taxspend - 6)

# Group the re-centred score into left / center / right; the result is a
# factor with exactly those three levels.
d$taxspendgroup <- car::recode(d$taxspend,
                                "-5:-2 = 'left';
                                -1:1 = 'center';
                                2:5 = 'right'",
                                as.factor = TRUE,
                                levels = c("left", "center", "right"))

# One logical flag per group.
d$taxspendgroupleft <- d$taxspendgroup == "left"
d$taxspendgroupcenter <- d$taxspendgroup == "center"
d$taxspendgroupright <- d$taxspendgroup == "right"

# Common identifiers shared with the other survey years.
d$id <- d$aaid
d$year <- 2010

#### See Nick memo for some summaries, shortcomings of these.

# Keep the full processed frame (taken before `region` is assigned, as in the
# original), then build the slim version: the first two columns, refno and
# region, and every column from "labfeel" (the first derived variable) to the
# end.
d2010_big <- d
d$region <- d$aaq1
# When `region` is not already a column of the raw file, the assignment above
# appends it as the last column, so its index is picked up twice: once by name
# and once by the labfeel:ncol(d) range. Selecting a column index twice makes
# `[.data.frame` return it twice (region and region.1). unique() keeps the
# first occurrence of each index and leaves the selection unchanged otherwise.
to.keep <- unique(c(
  1:2,
  which(names(d) %in% c("refno", "region")),
  which(names(d) == "labfeel"):ncol(d)
))
d2010 <- d[, to.keep]



###################
##### 2015 ########
###################


#### 2010 constituency results taken from the wave 2 panel file
#### (Nick's code, slightly modified)
besw2 <- suppressWarnings(
  read.spss("BES2015_W2_Panel_v2.0_1.sav", to.data.frame = TRUE)
)

# Constituency name, constituency reference number, and the eight 2010
# party columns.
vote10dat <- besw2[, c(
  "pcon", "pconrefno",
  "pconcon10", "pconlab10", "pconld10", "pconsnp10",
  "pconpc10", "pconukip10", "pcongrn10", "pconbnp10"
)]
matchup.labs <- c("Con", "Lab", "LD", "SNP", "PC", "UKIP", "Grn", "BNP")
# One row per constituency: keep the first row for each pconrefno.
vote10dat <- subset(vote10dat, !duplicated(pconrefno))
# Rename: first two columns become conname / pano; the rest lose their
# "pcon" prefix (e.g. pconcon10 -> con10).
party_cols_2010 <- gsub("pcon", "", names(vote10dat)[-(1:2)])
names(vote10dat) <- c("conname", "pano", party_cols_2010)
vote10dat <- vote10dat[!is.na(vote10dat$pano), ]


# 2015 results
res2015 <- read.dta("BES-2015-General-Election-results-file-v2.0_old.dta")
# results1015 = merge()
# BNP share in percent, computed from BNP votes over total votes.
res2015$BNP15 <- 100 * res2015$BNPVote15 / res2015$TotalVote15

# Join 2010 and 2015 results on pano. The 2015 party columns start at
# position 14 of the merged frame and are renamed positionally, so the order
# of `parties` must match the order of the columns selected from res2015.
vote1015 <- merge(
  vote10dat,
  res2015[, c(
    "pano", "Country", "Region", "ConstituencyType",
    "Con15", "Lab15", "LD15", "SNP15", "PC15",
    "UKIP15", "Green15", "BNP15", "Other15"
  )],
  by = "pano"
)
parties <- c("con", "lab", "ld", "snp", "pc", "ukip", "grn", "bnp", "other")
names(vote1015)[14:ncol(vote1015)] <- paste0(parties, "15")


# Load the combined BES wave 1-6 panel (takes a long time) and keep the rows
# with a non-missing wt_core_W6.
d <- suppressWarnings(
  read.spss("BES2015_W6_Panel_v1.2.sav", to.data.frame = TRUE)
)
besdat <- subset(d, !is.na(wt_core_W6))

# Merge in the constituency results, keeping every survey row.
besdat <- merge(besdat, vote1015, by = "pano", all.x = TRUE) # refno


##### Get N of post-election wave ####

# NOTE(review): this counts rows of the full panel `d`, not of the filtered
# `besdat` - confirm which N is intended.
nrow(d)


##### Get the feeling scores  ####

# Party and leader like/dislike items are copied from wave 4 (no suffix) and
# wave 6 ("post" suffix). `leaders` is aligned with `parties`; BNP has no
# leader item, hence the NA.
parties <- c("Con", "Lab", "LD", "SNP", "PC", "UKIP", "Grn", "BNP")
leaders <- c("Cameron", "Miliband", "Clegg", "Sturgeon", "Wood", "Farage", "Bennett", NA)

waves <- c(4, 6)
suffixes <- c("", "post")

# Recode one like/dislike item: "Don't know" becomes missing and the two
# labelled end points become "10" and "0".
# NOTE(review): the result stays a character vector; nothing in this section
# converts it to numeric, unlike the 2005/2010 feeling scores - confirm that
# later code does the conversion.
recode_like_scale <- function(x) {
  x <- as.character(x)
  x[x == "Don't know"] <- NA
  x[x == "Strongly like"] <- "10"
  x[x == "Strongly dislike"] <- "0"
  x
}

for (j in seq_along(parties)) {
  party <- parties[j]
  leader <- leaders[j]
  for (i in seq_along(waves)) {
    # Source items for this party and wave: the party item always, the leader
    # item only when the party has a leader entry. Skipping the NA leader
    # avoids building the bogus name "likeNAW<wave>" and reporting it as
    # "not found".
    sources <- c(feel = paste0("like", party, "W", waves[i]))
    if (!is.na(leader)) {
      sources <- c(sources, leaderfeel = paste0("like", leader, "W", waves[i]))
    }
    for (kind in names(sources)) {
      oldname <- sources[[kind]]
      newname <- paste0(tolower(party), kind, suffixes[i])
      if (oldname %in% names(besdat)) {
        besdat[[newname]] <- recode_like_scale(besdat[[oldname]])
      } else {
        cat(oldname, "not found.\n")
      }
    }
  }
}


# N of post-election wave respondents who received the like-dislike
# questions: rows with at least one non-missing party "feelpost" score.

feelpost_cols <- paste0(tolower(parties), "feelpost")
sum(rowSums(!is.na(besdat[, feelpost_cols])) > 0)


# previously used pre for post for three parties here. 


# Now get stated party preference. 


## 
# Party identification: wave 4 fills the unsuffixed variables and wave 6 the
# "post" ones.
besdat$partyID <- besdat$partyIdW4
besdat$partyIDpost <- besdat$partyIdW6
# Squeeze follow-up.
besdat$partyIDsqueeze <- besdat$partyIdSqueezeW4
besdat$partyIDsqueezepost <- besdat$partyIdSqueezeW6
# Strength of identification, stored as lower-case text.
pid_strength_w4 <- as.character(besdat$partyIdStrengthW4)
pid_strength_w6 <- as.character(besdat$partyIdStrengthW6)
besdat$partyIDstrength <- tolower(pid_strength_w4)
besdat$partyIDstrengthpost <- tolower(pid_strength_w6)

# Perceived likelihood of each party winning the constituency (0-100 scale),
# from wave 4 ("_precampaign") and wave 5 ("_campaign").

party.names <- c("Con", "Lab", "LD", "UKIP", "Green", "SNP", "PC")
vars <- paste0("winConstituency", party.names)
# Output prefixes, in the same order as party.names.
parties <- c("con", "lab", "ld", "ukip", "grn", "snp", "pc")

wave_suffix <- c("4" = "_precampaign", "5" = "_campaign")

for (k in seq_along(vars)) {
  for (wave in c(4, 5)) {
    target <- paste0(parties[k], "_win", wave_suffix[[as.character(wave)]])
    # Digits only -> numeric; anything above 100 becomes missing.
    chance <- gsub("[^\\d]", "", besdat[[paste0(vars[k], "W", wave)]], perl = TRUE)
    chance <- as.numeric(chance)
    chance[chance > 100] <- NA
    besdat[[target]] <- chance
  }
}


###### Coding the outcome ##### 


# Reported vote choice from wave 6.
besdat$vote.post <- besdat$generalElectionVoteW6
# Non-voter flag from the wave 6 retrospective turnout item: TRUE for
# "No, did not vote", NA for "Don't know" (or missing), FALSE otherwise.
besdat$did.not.vote.post <- ifelse(
  besdat$genElecTurnoutRetroW6 == "Don't know",
  NA,
  besdat$genElecTurnoutRetroW6 == "No, did not vote"
)

#### Now: extract some predictors 


# UK politician knowledge battery: five politicians, each paired with the
# answer that counts as correct.
knowvars <- paste0("polKnow", c("Miliband", "Clegg", "Osborne", "May", "Bercow"))
correctanswers <- c(
  "Leader of the Labour Party",
  "Deputy Prime Minister",
  "Chancellor of the Exchequer",
  "Home secretary",
  "Speaker of the House of Commons"
)

# One 0/1 column per item: 1 when the answer equals the correct one.
knowdf <- besdat[, knowvars]
for (item in seq_along(knowvars)) {
  knowdf[, item] <- as.numeric(knowdf[, item] == correctanswers[item])
}
names(knowdf) <- paste0(names(knowdf), "Dummy")

# Row-wise count and share of correct answers, ignoring missing items.
besdat$ukpolsKnowSum <- rowSums(knowdf, na.rm = TRUE)
besdat$ukpolsKnowMean <- rowMeans(knowdf, na.rm = TRUE)

besdat$ukpolknow <- besdat$ukpolsKnowMean



##### BES knowledge battery: international politicians

# International politicians and the answer that counts as correct for each;
# the variable-name stems are "polKnow" followed by the politician's name.
intl_answers <- c(
  Kerry = "United States Secretary of State",
  Hollande = "President of France",
  Netanyahu = "Prime Minister of Israel",
  Putin = "President of Russia",
  Merkel = "Chancellor of Germany",
  Assad = "President of Syria"
)
knowvars <- paste0("polKnow", names(intl_answers))
correctanswers <- unname(intl_answers)


## create knowledge variables containing information from all waves
# Where we only have one answer for a respondent, take this. Where we have multiple answers for a respondent across waves, take the first non-missing response (in column order), as getFirstAnswer does.

# Score a respondent's first non-missing answer.
#   x                 : vector of answers, one element per wave column.
#   the.correctanswer : the answer that counts as correct.
# Returns 1 if the first non-missing element of x equals the.correctanswer,
# 0 if it does not, and NA if every element of x is missing.
getFirstAnswer <- function(x, the.correctanswer) {
  answered <- which(!is.na(x))
  if (length(answered) == 0) {
    return(NA)
  }
  as.numeric(x[answered[1]] == the.correctanswer)
}

# For each international knowledge item, gather every column whose name
# contains the item stem and score the respondent's first non-missing answer
# (1 = correct, 0 = incorrect, NA = never answered). The score is stored as
# <stem>FirstResponse.
for (i in seq_along(knowvars)) {
  the.varname <- knowvars[i]
  the.correctanswer <- correctanswers[i]
  # drop = FALSE keeps a data frame even when only one column matches;
  # otherwise the selection collapses to a vector and apply() fails.
  tmpdf <- besdat[, grep(the.varname, names(besdat)), drop = FALSE]
  # Not used in the visible code; kept in case a later section reads it.
  nwaves <- ncol(tmpdf)
  besdat[paste0(the.varname, "FirstResponse")] <- apply(
    tmpdf, 1, getFirstAnswer,
    the.correctanswer = the.correctanswer
  )
}

# Summarise the international knowledge scores across all *FirstResponse
# columns. Respondents with no Kerry score are set to missing on both
# summaries (they never received the Hollande/Kerry/Netanyahu questions).

first_response_cols <- grep("FirstResponse", names(besdat))
never_asked <- is.na(besdat$polKnowKerryFirstResponse)

besdat$intpolsKnowSum <- rowSums(besdat[, first_response_cols], na.rm = TRUE)
besdat$intpolsKnowSum[never_asked] <- NA
besdat$intpolsKnowMean <- rowMeans(besdat[, first_response_cols], na.rm = TRUE)
besdat$intpolsKnowMean[never_asked] <- NA

besdat$intpolknow <- besdat$intpolsKnowMean



##### BES knowledge battery: domestic and international politicians combined

# Share correct over the 11 items (5 domestic + 6 international).
besdat$polknow <- (besdat$ukpolsKnowSum + besdat$intpolsKnowSum) / 11

# Convert to low / med / high quantile groups, with one 0/1 dummy each.
besdat$polknowcat <- quantcut(
  besdat$polknow,
  q = seq(0, 1, length.out = 4),
  na.rm = TRUE,
  labels = c("low", "med", "high")
)
besdat$polknowlow <- as.numeric(besdat$polknowcat == "low")
besdat$polknowmed <- as.numeric(besdat$polknowcat == "med")
besdat$polknowhigh <- as.numeric(besdat$polknowcat == "high")


# Above-median indicator (strictly greater than the median).
polknow_median <- median(besdat$polknow, na.rm = TRUE)
besdat$polknow.above.median <- as.numeric(besdat$polknow > polknow_median)


#### Knows MP or not

# Gather every column whose name contains "knowMPW" and score the
# respondent's first non-missing answer: 1 if it is "Correct name", 0 if not,
# NA if never answered. drop = FALSE keeps a data frame even when only one
# column matches; otherwise the selection collapses to a vector and apply()
# fails.
tmpdf <- besdat[, grep("knowMPW", names(besdat)), drop = FALSE]
besdat$mpknow <- apply(tmpdf, 1, getFirstAnswer, the.correctanswer = "Correct name")




#### Self-reported consumption of political news in media

# The five wave 6 information-source items (TV, paper, radio, internet,
# talking to people).
info_source_vars <- c(
  "infoSourceTVW6", "infoSourcePaperW6", "infoSourceRadioW6",
  "infoSourceInternetW6", "infoSourcePeopleW6"
)

# Flag respondents who report more than a given floor on at least one source.
#   data         : data frame holding the source items.
#   vars         : names of the source items.
#   floor_levels : answers that do NOT count (e.g. no time, don't know).
# Returns 1 if any source has a non-missing answer outside floor_levels,
# 0 if none does, and NA if all sources are missing.
# A missing answer no longer counts as time spent: a plain `!(x %in% ...)`
# is TRUE for NA, which coded every respondent with any unanswered item as 1.
any_info_source_above <- function(data, vars, floor_levels) {
  answers <- data[vars]
  exceeds <- lapply(answers, function(x) !is.na(x) & !(x %in% floor_levels))
  unanswered <- lapply(answers, is.na)
  flag <- as.numeric(Reduce(`|`, exceeds))
  flag[Reduce(`&`, unanswered)] <- NA
  flag
}

# does the respondent spend any time at all getting political info from any source?
besdat$infosometime <- any_info_source_above(
  besdat, info_source_vars,
  c("None, no time at all", "Don't know")
)

# does the respondent spend at least 30 mins or more getting political info from any source?
besdat$info30plus <- any_info_source_above(
  besdat, info_source_vars,
  c("None, no time at all", "Don't know", "Less than 1/2 hour")
)


#### Self-reported political attention
# Note: the interest-in-election variable is left alone, as it is measured on
# four levels and more than 50% of observations lie at the maximum level.

# Numeric code of the wave 6 attention item shifted down by one; "Don't know"
# answers are set to missing.
besdat$attention <- as.numeric(besdat$polAttentionW6) - 1
besdat$attention[besdat$polAttentionW6 == "Don't know"] <- NA

# Three quantile groups with one 0/1 dummy each.
besdat$polattncat <- quantcut(
  besdat$attention,
  q = seq(0, 1, length.out = 4),
  na.rm = TRUE,
  labels = c("low", "med", "high")
)
besdat$polattnlow <- as.numeric(besdat$polattncat == "low")
besdat$polattnmed <- as.numeric(besdat$polattncat == "med")
besdat$polattnhigh <- as.numeric(besdat$polattncat == "high")

# Above-median indicator from a two-way quantile split.
besdat$polattn.above.median <- as.numeric(
  quantcut(
    besdat$attention,
    q = seq(0, 1, length.out = 3),
    na.rm = TRUE,
    labels = c("no", "yes")
  ) == "yes"
)




#### Has the respondent been contacted by a party?
# 1 when the wave 6 contact item is "Yes", 0 otherwise, NA when missing.
contacted_w6 <- besdat$partyContact1newW6 == "Yes"
besdat$partycontact <- as.numeric(contacted_w6)



#### Personality traits (Big 5)

# These appear to be based on TIPI responses. Each trait score is passed
# through arm's rescale().
besdat$bigfiveagree <- arm::rescale(besdat$personality_agreeableness)
besdat$bigfiveopen <- arm::rescale(besdat$personality_openness)
besdat$bigfiveconsc <- arm::rescale(besdat$personality_conscientiousness)
besdat$bigfiveextr <- arm::rescale(besdat$personality_extraversion)
besdat$bigfiveneur <- arm::rescale(besdat$personality_neuroticism)

# Above-median indicator for each rescaled trait (strictly greater than the
# median, computed ignoring missing values).
for (trait in c("bigfiveagree", "bigfiveopen", "bigfiveconsc", "bigfiveextr", "bigfiveneur")) {
  trait_score <- besdat[[trait]]
  besdat[[paste0(trait, ".above.median")]] <-
    as.numeric(trait_score > median(trait_score, na.rm = TRUE))
}




#### Gender
# 1 when gender is "Female", 0 for any other recorded value, NA when missing.
besdat$female <- with(besdat, as.numeric(gender == "Female"))


#### Agegroup

besdat$age <- besdat$Age

# Three-way groupings. The arithmetic yields 1 for Age <= 29, 2 for the middle
# band and 3 for the oldest band. `levels = 1:3` is given explicitly so that the
# three labels always line up with the codes; without it factor() stops with
# "invalid 'labels'" whenever one of the bands has no respondents.
besdat$agegrp <- factor(3 - (besdat$Age <= 29) - (besdat$Age <= 59),
                        levels = 1:3,
                        labels = c("below30", "30to59", "60plus"))
besdat$agegrp.alt <- factor(3 - (besdat$Age <= 29) - (besdat$Age <= 49),
                            levels = 1:3,
                            labels = c("below30", "30to49", "50plus"))

# dummy for 50+
besdat$agefiftyplus <- as.numeric(besdat$Age >= 50)


#### Income group

# note several don't knows and prefer not to answers here

### income
besdat$income.raw <- besdat$profile_gross_household  # bq96

# Reduce each band label to "<lower>-<upper>" (a single number for the
# open-ended bottom and top bands): turn " to " into a hyphen, then drop every
# character that is not a digit or a hyphen (currency signs, commas, words).
# Labels containing no digits collapse to "".
besdat$income.processed <- gsub("[^\\-\\d]", "",
                                gsub(" to ", " - ", besdat$income.raw),
                                perl = TRUE)

# Hand-picked three-way grouping of the bands.
besdat$incomealt <- car::recode(
  besdat$income.processed,
  'c("5000", "5000-9999", "10000-14999", "15000-19999") = "low";
   c("20000-24999", "25000-29999", "30000-34999", "35000-39999", "40000-44999", "45000-49999") = "med";
   c("50000-59999", "60000-69999", "70000-99999", "100000-149999", "150000") = "high"'
)
# Anything not assigned to a group (e.g. the empty strings) becomes NA.
besdat$incomealt[!besdat$incomealt %in% c("low", "med", "high")] <- NA

# Logical dummies; comparing NA with a string already gives NA, so no separate
# NA step is needed.
besdat$incomealtlow <- besdat$incomealt == "low"
besdat$incomealtmed <- besdat$incomealt == "med"
besdat$incomealthigh <- besdat$incomealt == "high"

# Lower bound of each band as an integer ("" becomes NA).
# NOTE(review): "5000" and "5000-9999" both give incomeband 5000, so the two
# lowest bands are indistinguishable from here on -- confirm this is acceptable.
besdat$incomeband <- as.integer(gsub("\\-.+", "", besdat$income.processed))
besdat$income.above.median <- besdat$incomeband > median(besdat$incomeband, na.rm = TRUE)

# Data-driven three-way split of incomeband and its logical dummies.
besdat$income <- quantcut(besdat$incomeband, q = 3, na.rm = TRUE)
besdat$incomelow <- besdat$income == levels(besdat$income)[1]
besdat$incomemed <- besdat$income == levels(besdat$income)[2]
besdat$incomehigh <- besdat$income == levels(besdat$income)[3]




#### Education level

### Education

# Keep an untouched copy of the original education variable.
besdat$education.bak <- besdat$education

# Collapse the detailed education categories into qualification levels.
# Categories not listed below are set to NA (`else = NA`).
besdat$qualifications <- car::recode(besdat$education,
                                      "'No formal qualifications'='No qualifications';
                                      'Youth training certificate/skillseekers'='Level 2';
                                      'Recognised trade apprenticeship completed'='Level 2';
                                      'Clerical and commercial'='Level 1';
                                      'City and Guild certificate'='Level 1';
                                      'City and Guild certificate - advanced'='Level 2';
                                      'onc'='Level 2';
                                      'CSE grades 2-5'='Level 1';
                                      'CSE grade 1, GCE O level, GCSE, School Certificate'='Level 2';
                                      'Scottish Ordinary/ Lower Certificate'='Level 2';
                                      'GCE A level or Higher Certificate'='Level 3';
                                      'Scottish Higher Certificate'='Level 3';
                                      'Nursing qualification (eg SEN, SRN, SCM, RGN)'='Level 4/5';
                                      'Teaching qualification (not degree)'='Level 4/5';
                                      'University diploma'='Level 4/5';
                                      'University or CNAA first degree (eg BA, B.Sc, B.Ed)'='Level 4/5';
                                      'University or CNAA higher degree (eg M.Sc, Ph.D)'='Level 4/5';
                                      'Other technical, professional or higher qualification'='Other';else = NA")

# table(besdat$qualifications, useNA="ifany")

# dummy for Level 4/5 (degrees plus the nursing, teaching and university
# diploma categories mapped to Level 4/5 above); NA when qualifications is NA
besdat$educuni <- as.numeric(besdat$qualifications == "Level 4/5")

# dummy for level 3 or higher
# `%in%` never returns NA, so on its own it would code respondents with a
# missing qualification as 0; restore NA so this dummy treats missing values the
# same way as educuni.
# NOTE(review): confirm the 2005/2010 sections handle missing values the same way.
besdat$educl3plus <- as.numeric(besdat$qualifications %in% c("Level 4/5", "Level 3"))
besdat$educl3plus[is.na(besdat$qualifications)] <- NA




#### Now code extra relevant attitudinal variables included only in 2015 survey



#### How likely that vote makes difference in constituency? (Only measured Wave 4)
#### "How likely is it that your vote will make a difference in terms of which party wins the election in your local constituency?" (11-point scale)

# Factor code shifted to start at 0; "Don't know" and missing answers are NA.
besdat$efficvote <- replace(
  as.numeric(besdat$voteMakesDifferenceW4) - 1,
  besdat$voteMakesDifferenceW4 %in% c("Don't know", NA),
  NA
)


#### "People should vote for the party they like the most, even if it's not likely to win" (agree/disagree, five-point scale)

# Factor code of the agree/disagree answer; "Don't know" and missing are NA.
besdat$attitude_votesincere <- replace(
  as.numeric(besdat$smallVoterPrefW6),
  besdat$smallVoterPrefW6 %in% c("Don't know", NA),
  NA
)


#### "People who vote for small parties are throwing away their vote"  (agree/disagree, five-point scale)

# Same coding as above: factor code, with "Don't know" and missing set to NA.
besdat$attitude_votesmall <- replace(
  as.numeric(besdat$smallPartyWastedVoteW6),
  besdat$smallPartyWastedVoteW6 %in% c("Don't know", NA),
  NA
)




##### Make sincere/strategic index

# cor(besdat$attitude_votesmall, besdat$attitude_votesincere,  use ="complete.obs")

# Mean of attitude_votesmall and the reversed (6 - x) attitude_votesincere item.
besdat$attitude_stratindex <- with(
  besdat,
  (attitude_votesmall + (6 - attitude_votesincere)) / 2
)

# 1 when the index is strictly above its sample median, 0 otherwise, NA if missing.
besdat$stratindex.above.median <- with(
  besdat,
  as.numeric(attitude_stratindex > median(attitude_stratindex, na.rm = TRUE))
)

##### Other internal/external efficacy questions that could be coded up

# I have a pretty good understanding of the important political issues facing our country (efficacyUnderstandW6)
# It takes too much time and effort to be active in politics and public affairs (efficacyTooMuchEffortW6)
# It is often difficult for me to understand what is going on in government and politics (efficacyNotUnderstandW6)
# Politicians don't care what people like me think (efficacyPolCareW6)
# Going to vote is a lot of effort (efficacyVoteEffortW6)
# Voting is an enjoyable experience (efficacyEnjoyVoteW6)



##### Left-right measure
# "In politics people sometimes talk of left and right. Where would you place
# yourself on the following scale?" (scale runs from 0 to 10)

# Numeric codes of leftRightW6, with codes above 11 (Don't know = 12) set to NA,
# then re-centered at 0 by subtracting 6.
besdat$leftRight <- local({
  lr.code <- as.numeric(besdat$leftRightW6)
  lr.code[lr.code > 11] <- NA
  lr.code - 6
})

# Three-way grouping of the re-centered scale.
besdat$lrgroup <- car::recode(besdat$leftRight,
                               "-5:-2 = 'left';
                               -1:1 = 'center';
                               2:5 = 'right'",
                               as.factor = TRUE,
                               levels = c("left","center","right"))

# One logical dummy per group: lrgroupleft, lrgroupcenter, lrgroupright.
for (lr.side in c("left", "center", "right")) {
  besdat[[paste0("lrgroup", lr.side)]] <- besdat$lrgroup == lr.side
}


# localism importance
# Strip letters, spaces, backslashes and hyphens from the answer label and read
# what is left as an integer (NA when nothing numeric remains).
besdat$localism_importance <- local({
  localism.label <- as.character(besdat$localismImportanceW4)
  as.integer(gsub("[a-zA-Z \\-]", "", localism.label))
})
# TODO: recode this



#### See Nick memo for some summaries, shortcomings of these.

besdat$reasonForVote <- besdat$reasonForVoteW6
besdat$reasonForVoteOth <- besdat$ReasonForvoteOthW6
# The 2015 survey did not ask which party the respondent really preferred, so
# these are placeholders (same column order as the chained assignment they replace).
besdat$otherPartyReallyPrefer <- NA
besdat$partyReallyPrefer <- NA

# Names shared with the other survey years.
besdat$refno <- besdat$pano
besdat$region <- besdat$Region
besdat$year <- 2015

# Full processed 2015 data.
d2015_big <- besdat

# Reduced 2015 data: the first 28 columns, a few named columns, and everything
# from `con10` to the last column.
# unique() drops repeated indices: a column that is matched by name and also
# falls inside the con10:ncol range (or the first 28) would otherwise be
# selected twice and show up as a duplicated column with a ".1" suffix.
to.keep <- unique(c(
  1:28,
  which(names(besdat) %in% c("age", "Region", "reasonForVote", "reasonForVoteOth")),
  which(names(besdat) == "con10"):ncol(besdat)
))
d2015 <- besdat[, to.keep]



#######################
##### Combining #######
#######################

parties <- c("lab", "con", "ld", "ukip", "grn", "snp", "pc")

# Columns of the combined file, in output order.
vars <- c("id", "refno", "female", "educuni", "educl3plus",
          "age", "agegrp", "agegrp.alt",
          "income", "incomelow", "incomemed", "incomehigh",
          "incomealt", "incomealtlow", "incomealtmed", "incomealthigh",
          "partycontact", "vote.post", "did.not.vote.post",
          # "mf.tactical", "vote.code", "ui",
          paste0(parties, "feel"),
          paste0(parties, "feelpost"),
          paste0(parties, "leaderfeel"),
          paste0(parties, "leaderfeelpost"),
          "year", "region",
          "qualifications", "income.above.median", "agefiftyplus",
          "reasonForVote", "reasonForVoteOth",
          "partyReallyPrefer", "otherPartyReallyPrefer",
          "efficvote", "attitude_votesincere", "attitude_votesmall", "attitude_stratindex",
          "bigfiveagree", "bigfiveopen", "bigfiveconsc", "bigfiveextr", "bigfiveneur",
          "bigfiveagree.above.median", "bigfiveopen.above.median",
          "bigfiveconsc.above.median", "bigfiveextr.above.median",
          "bigfiveneur.above.median",
          "attention", "polattn.above.median", "stratindex.above.median",
          "polknow", "polknow.above.median",
          "leftRight", "lrgroup", "lrgroupleft", "lrgroupcenter", "lrgroupright",
          "taxspend", "taxspendgroup", "taxspendgroupleft", "taxspendgroupcenter",
          "taxspendgroupright",
          paste0(parties, "_win_campaign"), paste0(parties, "_win_precampaign"),
          "partyID", "partyIDpost", "partyIDsqueeze", "partyIDsqueezepost",
          "partyIDstrength", "partyIDstrengthpost", "localism_importance"
)

# Add an all-NA column for every wanted variable that `dat` lacks (e.g. a
# variable asked in only one survey year), reporting each one on the console.
# Returns the padded data frame.
add.missing.vars <- function(dat, wanted, survey.year) {
  for (missing.var in setdiff(wanted, names(dat))) {
    cat(missing.var, paste0(" not in ", survey.year, " BES data.\n"))
    dat[[missing.var]] <- NA
  }
  dat
}

d2005 <- add.missing.vars(d2005, vars, "2005")
d2010 <- add.missing.vars(d2010, vars, "2010")
d2015 <- add.missing.vars(d2015, vars, "2015")

# Stack the three surveys, newest first.
D <- rbind(d2015[, vars], d2010[, vars], d2005[, vars])

# Country from the region label; everything not matching Scotland or Wales is
# labelled England.
# NOTE(review): rows with a missing region (including a survey year for which
# `region` was padded with NA above) are also labelled "England" -- confirm.
D$country <- "England"
D$country[grepl("[Ss]cotland", D$region)] <- "Scotland"
D$country[grepl("[Ww]ales", D$region)] <- "Wales"

###

write.csv(D, "BES_2005_2010_2015_combined.csv", row.names = FALSE)

#################################################################################
###### end basic data processing, i.e. pulling data out of the raw BES files #### 
#################################################################################

### see processing_combined_bes_data_v1.R



